/* 
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System.Collections.Generic;

namespace Lucene.Net.Analysis
{
    
    /// <summary> Loader for text files that represent a list of stopwords.</summary>
    public class WordlistLoader
    {
        
        /// <summary> Builds a word set from a text file that holds one word per line.
        /// The file is opened with the system default encoding
        /// (<c>System.Text.Encoding.Default</c>) and every line, stripped of leading and
        /// trailing whitespace, becomes an entry of the returned set. Supply the words
        /// in lowercase when the consuming Analyzer applies a LowerCaseFilter
        /// (as StandardAnalyzer does).
        /// </summary>
        /// <param name="wordfile">File containing the wordlist</param>
        /// <returns> A set holding the words read from the file</returns>
        public static ISet<string> GetWordSet(System.IO.FileInfo wordfile)
        {
            System.IO.TextReader input = new System.IO.StreamReader(wordfile.FullName, System.Text.Encoding.Default);
            try
            {
                // The actual line parsing lives in the TextReader overload.
                return GetWordSet(input);
            }
            finally
            {
                // Release the file handle whether or not reading succeeded.
                input.Dispose();
            }
        }
        
        /// <summary> Builds a word set from a text file that holds one word per line,
        /// leaving out lines that begin with the given comment string.
        /// The file is opened with the system default encoding
        /// (<c>System.Text.Encoding.Default</c>) and every remaining line, stripped of
        /// leading and trailing whitespace, becomes an entry of the returned set. Supply
        /// the words in lowercase when the consuming Analyzer applies a LowerCaseFilter
        /// (as StandardAnalyzer does).
        /// </summary>
        /// <param name="wordfile">File containing the wordlist</param>
        /// <param name="comment">Prefix that marks a line as a comment to be ignored</param>
        /// <returns> A set holding the non-comment words read from the file</returns>
        public static ISet<string> GetWordSet(System.IO.FileInfo wordfile, System.String comment)
        {
            System.IO.TextReader input = new System.IO.StreamReader(wordfile.FullName, System.Text.Encoding.Default);
            try
            {
                // The actual line parsing and comment filtering live in the TextReader overload.
                return GetWordSet(input, comment);
            }
            finally
            {
                // Release the file handle whether or not reading succeeded.
                input.Dispose();
            }
        }
        
        
        /// <summary> Consumes the reader line by line until it is exhausted and collects
        /// each line, stripped of leading and trailing whitespace, into a set. Every line
        /// is expected to hold exactly one word; no line is filtered out, so a blank line
        /// contributes an empty-string entry. Supply the words in lowercase when the
        /// consuming Analyzer applies a LowerCaseFilter (as StandardAnalyzer does).
        /// The reader is not closed by this method.
        /// </summary>
        /// <param name="reader">Reader containing the wordlist</param>
        /// <returns>A set holding the words read from the reader</returns>
        public static ISet<string> GetWordSet(System.IO.TextReader reader)
        {
            var words = Support.Compatibility.SetFactory.CreateHashSet<string>();

            // ReadLine returns null once the end of the input has been reached.
            for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
            {
                string trimmed = line.Trim();
                words.Add(trimmed);
            }

            return words;
        }

        /// <summary> Reads lines from a Reader and adds every non-comment line as an entry
        /// to a set (omitting leading and trailing whitespace). Every line of the Reader
        /// should contain only one word. The words need to be in lowercase if you make use
        /// of an Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
        /// </summary>
        /// <remarks>
        /// A line is a comment when it begins with <paramref name="comment"/>, compared
        /// ordinally (character by character, independent of the current culture). The
        /// check is made on the raw line before trimming, so a comment marker preceded by
        /// whitespace does not mark the line as a comment. The reader is not closed by
        /// this method.
        /// </remarks>
        /// <param name="reader">Reader containing the wordlist
        /// </param>
        /// <param name="comment">The string representing a comment.
        /// </param>
        /// <returns> A set with the reader's non-comment words
        /// </returns>
        public static ISet<string> GetWordSet(System.IO.TextReader reader, System.String comment)
        {
            var result = Support.Compatibility.SetFactory.CreateHashSet<string>();

            string word;
            while ((word = reader.ReadLine()) != null)
            {
                // Ordinal comparison: the comment marker is a literal prefix, so the result
                // must not depend on the thread's current culture or its collation rules.
                if (!word.StartsWith(comment, System.StringComparison.Ordinal))
                {
                    result.Add(word.Trim());
                }
            }

            return result;
        }



        /// <summary> Reads a stem dictionary. Each line contains:
        /// <c>word<b>\t</b>stem</c>
        /// (i.e. two tab separated words)
        /// </summary>
        /// <remarks>
        /// The file is read with the system default encoding
        /// (<c>System.Text.Encoding.Default</c>). Each line is split at its first tab only,
        /// so any further tabs remain part of the stem. Lines that contain no tab (for
        /// example a blank line at the end of the file) are skipped. When a word occurs
        /// more than once, the last occurrence wins.
        /// </remarks>
        /// <param name="wordstemfile">File containing the tab separated word/stem pairs</param>
        /// <returns> stem dictionary that overrules the stemming algorithm
        /// </returns>
        /// <exception cref="System.NullReferenceException">If <paramref name="wordstemfile"/> is null.</exception>
        /// <exception cref="System.IO.IOException">If the file cannot be opened or read.</exception>
        public static Dictionary<string, string> GetStemDict(System.IO.FileInfo wordstemfile)
        {
            // The exception type is kept as-is so existing callers observe the same failure.
            if (wordstemfile == null)
                throw new System.NullReferenceException("wordstemfile may not be null");

            var result = new Dictionary<string, string>();
            char[] tab = {'\t'};

            // A single reader owns the file stream; 'using' closes it even if reading fails.
            using (var reader = new System.IO.StreamReader(wordstemfile.FullName, System.Text.Encoding.Default))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    // Split at the first tab only: [0] is the word, [1] is the stem.
                    string[] wordstem = line.Split(tab, 2);
                    if (wordstem.Length < 2)
                    {
                        // No tab on this line, hence no word/stem pair to record.
                        continue;
                    }
                    result[wordstem[0]] = wordstem[1];
                }
            }

            return result;
        }
    }
}